CS 584-Theory and Applications of Data Mining
Time Series Forecasting Project
Importing Necessary libraries¶
In [1]:
import numpy as np
import pandas as pd
import geopandas as gpd
import calendar
from shapely.geometry import Point
import seaborn as sns
import folium
from folium.plugins import MarkerCluster
from pylab import rcParams
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.api import ExponentialSmoothing, SimpleExpSmoothing, Holt
import sys
import itertools
import statsmodels.api as sm
import statsmodels.tsa.api as smt
import statsmodels.formula.api as smf
from statsmodels.tsa.stattools import adfuller
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from matplotlib import pyplot as plt
%matplotlib inline
Reading the Dataset¶
In [2]:
df = pd.read_csv('Baltimore_Crime_Data.csv')
In [3]:
df.head()
Out[3]:
| CrimeDate | CrimeTime | CrimeCode | Location | Description | Inside/Outside | Weapon | Post | District | Neighborhood | Location 1 | Total Incidents | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 11/12/2016 | 02:35:00 | 3B | 300 SAINT PAUL PL | ROBBERY - STREET | O | NaN | 111.0 | CENTRAL | Downtown | (39.2924100000, -76.6140800000) | 1 |
| 1 | 11/12/2016 | 02:56:00 | 3CF | 800 S BROADWAY | ROBBERY - COMMERCIAL | I | FIREARM | 213.0 | SOUTHEASTERN | Fells Point | (39.2824200000, -76.5928800000) | 1 |
| 2 | 11/12/2016 | 03:00:00 | 6D | 1500 PENTWOOD RD | LARCENY FROM AUTO | O | NaN | 413.0 | NORTHEASTERN | Stonewood-Pentwood-Winston | (39.3480500000, -76.5883400000) | 1 |
| 3 | 11/12/2016 | 03:00:00 | 6D | 6600 MILTON LN | LARCENY FROM AUTO | O | NaN | 424.0 | NORTHEASTERN | Westfield | (39.3626300000, -76.5516100000) | 1 |
| 4 | 11/12/2016 | 03:00:00 | 6E | 300 W BALTIMORE ST | LARCENY | O | NaN | 111.0 | CENTRAL | Downtown | (39.2893800000, -76.6197100000) | 1 |
Data Pre-processing¶
In [4]:
df['District'].nunique()
Out[4]:
13
In [5]:
df['CrimeDate'] = df["CrimeDate"].str.cat(df["CrimeTime"], sep = " ")
In [6]:
df['CrimeDate'].loc[198664]=pd.to_datetime('09/24/2012 00:00:00')
C:\Users\tejas\AppData\Local\Temp\ipykernel_9520\2896351813.py:1: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
df['CrimeDate'].loc[198664]=pd.to_datetime('09/24/2012 00:00:00')
In [7]:
df[['Latitude', 'Longitude']] =df['Location 1'].str.strip('()').str.split(',', expand=True)
In [8]:
df['Latitude'] = df['Latitude'].astype(float)
df['Longitude'] = df['Longitude'].astype(float)
In [9]:
df['CrimeDate'] = pd.to_datetime(df['CrimeDate'])
In [10]:
df['CrimeDate'].dt.year.unique()
Out[10]:
array([2016, 2015, 2014, 2013, 2012, 2011], dtype=int64)
In [11]:
df.dtypes
Out[11]:
CrimeDate datetime64[ns] CrimeTime object CrimeCode object Location object Description object Inside/Outside object Weapon object Post float64 District object Neighborhood object Location 1 object Total Incidents int64 Latitude float64 Longitude float64 dtype: object
In [12]:
for i in range(0,len(df)):
if df['Inside/Outside'].iloc[i]=='I':
continue
elif df['Inside/Outside'].iloc[i]=='O':
continue
elif df['Inside/Outside'].iloc[i]=='Inside':
df['Inside/Outside'].iloc[i]='I'
elif df['Inside/Outside'].iloc[i]=='Outside':
df['Inside/Outside'].iloc[i]='O'
C:\Users\tejas\AppData\Local\Temp\ipykernel_9520\1713855148.py:9: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df['Inside/Outside'].iloc[i]='O' C:\Users\tejas\AppData\Local\Temp\ipykernel_9520\1713855148.py:7: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df['Inside/Outside'].iloc[i]='I'
In [13]:
df['Inside/Outside'].unique()
Out[13]:
array(['O', 'I', nan], dtype=object)
In [14]:
df.shape
Out[14]:
(285807, 14)
In [15]:
df.describe()
Out[15]:
| Post | Total Incidents | Latitude | Longitude | |
|---|---|---|---|---|
| count | 285616.000000 | 285807.0 | 284188.000000 | 284188.000000 |
| mean | 504.234184 | 1.0 | 39.308704 | -76.617269 |
| std | 261.354783 | 0.0 | 0.061588 | 0.042107 |
| min | 0.000000 | 1.0 | 39.200410 | -76.711440 |
| 25% | 242.000000 | 1.0 | 39.288380 | -76.648020 |
| 50% | 445.000000 | 1.0 | 39.303690 | -76.613960 |
| 75% | 723.000000 | 1.0 | 39.327570 | -76.587600 |
| max | 945.000000 | 1.0 | 41.629730 | -76.517840 |
In [16]:
rcParams['figure.figsize'] = 12, 8
df['Year'] = df['CrimeDate'].dt.year
complaints_by_year = df.groupby('Year')['Total Incidents'].sum()
complaints_by_year.plot(kind='line', figsize=(12, 8))
plt.title('Complaints by Year')
plt.xlabel('Year')
plt.ylabel('Total Crimes')
plt.show()
In [17]:
df.isna().sum()
Out[17]:
CrimeDate 0 CrimeTime 0 CrimeCode 0 Location 1623 Description 0 Inside/Outside 4196 Weapon 188411 Post 191 District 58 Neighborhood 1701 Location 1 1619 Total Incidents 0 Latitude 1619 Longitude 1619 Year 0 dtype: int64
In [18]:
column_to_check = ['Latitude','Longitude']
df.dropna(subset=column_to_check, inplace=True)
In [19]:
df= df.fillna('NA')
In [20]:
df.isna().sum()
Out[20]:
CrimeDate 0 CrimeTime 0 CrimeCode 0 Location 0 Description 0 Inside/Outside 0 Weapon 0 Post 0 District 0 Neighborhood 0 Location 1 0 Total Incidents 0 Latitude 0 Longitude 0 Year 0 dtype: int64
In [21]:
df['Total Incidents'].isnull().sum()
Out[21]:
0
In [22]:
df.shape
Out[22]:
(284188, 15)
In [23]:
# Extract year and month into new columns
df['Year'] = df['CrimeDate'].dt.year
df['Month'] = df['CrimeDate'].dt.month
# Group by year and month, and count the incidents
monthly_agg = df.groupby(['Latitude','Longitude']).size().reset_index(name='Incident Count')
print(monthly_agg)
Latitude Longitude Incident Count 0 39.20041 -76.55602 6 1 39.20047 -76.55605 7 2 39.20155 -76.55021 1 3 39.20196 -76.55686 2 4 39.20208 -76.55695 1 ... ... ... ... 97946 41.62362 -76.52606 1 97947 41.62429 -76.52516 1 97948 41.62513 -76.52402 1 97949 41.62711 -76.52136 1 97950 41.62973 -76.51784 1 [97951 rows x 3 columns]
In [24]:
district_crimes = df.groupby(['Year','District']).size().reset_index(name='Incident Count')
district_crimes = district_crimes.dropna(subset=['District'])
dist= pd.pivot_table(district_crimes, values = "Incident Count", columns = "District", index = "Year")
dist
Out[24]:
| District | CENTRAL | EASTERN | Gay Street | NA | NORTHEASTERN | NORTHERN | NORTHESTERN | NORTHWESTERN | SOUTHEASTERN | SOUTHERN | SOUTHESTERN | SOUTHWESTERN | WESTERN |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Year | |||||||||||||
| 2011 | 6290.0 | 4348.0 | NaN | NaN | 8434.0 | 5067.0 | 8.0 | 4876.0 | 6719.0 | 6052.0 | 1.0 | 4491.0 | 4068.0 |
| 2012 | 6494.0 | 4227.0 | NaN | 1.0 | 7684.0 | 5468.0 | 5.0 | 4854.0 | 6555.0 | 5775.0 | 2.0 | 4418.0 | 3825.0 |
| 2013 | 5618.0 | 4122.0 | NaN | NaN | 7788.0 | 5652.0 | 3.0 | 5272.0 | 6950.0 | 5576.0 | 2.0 | 4248.0 | 4055.0 |
| 2014 | 5022.0 | 3648.0 | NaN | NaN | 7479.0 | 5232.0 | 2.0 | 4687.0 | 6398.0 | 5188.0 | NaN | 4353.0 | 3729.0 |
| 2015 | 5310.0 | 4087.0 | NaN | NaN | 7661.0 | 5850.0 | 9.0 | 4727.0 | 7050.0 | 5180.0 | 2.0 | 4571.0 | 3978.0 |
| 2016 | 4885.0 | 3610.0 | 1.0 | NaN | 5786.0 | 4457.0 | 9.0 | 4150.0 | 5573.0 | 5075.0 | 3.0 | 4019.0 | 3509.0 |
Visualizing Crimes by Location of Occurence (Inside/Outside)¶
In [25]:
label_counts = df['Inside/Outside'].value_counts()
# Create a pie chart.
plt.figure(figsize=(6, 6))
plt.pie(label_counts, labels=label_counts.index, autopct='%1.1f%%', startangle=140)
plt.title('Crimes by Location (Inside/Outside)')
# Display the pie chart.
plt.show()
In [26]:
def create_point(monthly_agg):
return Point(monthly_agg['Longitude'], monthly_agg['Latitude'])
monthly_agg['geometry'] = monthly_agg.apply(create_point, axis=1)
In [28]:
with open('gz_2010_us_040_00_500k.json', 'rb') as file:
data = file.read()
if data.startswith(b'StartingBytes'):
gdf_us_states = gpd.read_file('gz_2010_us_040_00_500k.json')
else:
gdf_us_states = gpd.read_file('gz_2010_us_040_00_500k.json')
In [29]:
us_map = folium.Map(location=[37.0902, -95.7129], zoom_start=4)
folium.GeoJson(gdf_us_states).add_to(us_map)
Out[29]:
<folium.features.GeoJson at 0x2b34851ddd0>
In [30]:
# Create a MarkerCluster for efficient marker rendering
marker_cluster = MarkerCluster().add_to(us_map)
for idx, row in monthly_agg.iterrows():
incident_count = row['Incident Count']
latitude = row['Latitude']
longitude = row['Longitude']
folium.Marker([latitude, longitude], tooltip=f'{incident_count} incidents').add_to(marker_cluster)
In [31]:
us_map.save('us_map.html')
In [130]:
us_map
Out[130]:
Make this Notebook Trusted to load map: File -> Trust Notebook